Neural composer

In [ ]:
# Environment setup: install the cuDNN 7.3 runtime (CUDA 9.0) and the Python
# dependencies used by this notebook.
# NOTE(review): `pip install -I` without version pins is not reproducible —
# consider pinning versions and using `%pip` so the install targets this
# kernel's environment.
!sudo dpkg -i ~/tensorflow/personal-scratch/kake/cudnn/libcudnn7_7.3.1.20-1+cuda9.0_amd64.deb
!pip install -I --user tensorflow-gpu pretty_midi pypianoroll pandas
In [1]:
import IPython
import pretty_midi
import numpy as np
from time import time
import matplotlib.pyplot as plt
import pypianoroll as pproll
import tensorflow as tf
import pandas as pd
import numpy as np

tf.enable_eager_execution()
%matplotlib inline

Problem formulation

Task framed as an autoregressive multilabel classification (that is, at each time step do a multilabel classification conditioned on the classification choices already made).

While this is fairly straightforward, there is a discrepancy between training (next-step prediction) and the true objective (long-term generation).

I ended up doing it anyway; at the end there is more information on other techniques attempted, and some not attempted.

In [2]:
# Training hyper-parameters
BATCH_SIZE = 32        # songs per mini-batch
NUM_LAYERS = 2         # stacked LSTM layers
RNN_SIZE = 256         # LSTM hidden units per layer
FEATURE_SIZE = 128     # MIDI keys per time step
EPSILON = 1e-5
L2_WEIGHT = .001       # weight for the (optional, currently disabled) L2 penalty
EPOCHS = 5000          # passes over the (small) song dataset

# Data and checkpoint locations
PIANOROLL_PATH = 'songs/*.csv'
COMPOSERS = ['mz', 'br', 'de', 'ba']  # two-letter file-name prefixes per composer
SAVE_DIR = 'composer/'

Dataset preprocessing:

  • Assume pretty_midi piano_roll matrices
  • Assume first two letters of file name identifies composer
  • Remove first column and first row (both are indices) and add final marker (all keys 128)
  • Batch into mini batches of BATCH_SIZE songs
  • Keep record of each matrix length and pad all matrices to longest matrix in mini batch
  • Return mini-batches of shape BATCH_SIZE x NUM_KEYS x MAX_TIME_STEPS
  • Iterate over songs EPOCHS times

As the dataset is very small, we don't bother with truncated backpropagation and instead cache the pre-batched matrices in memory.

In [3]:
with tf.device('cpu:0'):
    # Composer prefix ('mz', 'br', ...) -> integer id lookup table; prefixes
    # not in COMPOSERS map to the table's default id of -1.
    mapping = tf.contrib.lookup.index_table_from_tensor(tf.constant(COMPOSERS))

    def parse_numeric_csv(dataset):
        '''Parse a dataset of raw CSV file contents into dense numeric matrices.

        Splits each file on newlines, each line on commas (ragged rows are
        padded with '0'), then converts the string cells to numbers.
        '''
        return (dataset
                .map(lambda x: tf.sparse.to_dense(tf.string_split([x], '\n'), '')[0])
                .map(lambda x: tf.sparse.to_dense(tf.string_split(x, ','), '0'))
                .map(tf.strings.to_number))

    def get_composer_ids(x):
        '''Map a file path to a composer id via the first two basename chars.'''
        file_name = tf.sparse.to_dense(tf.strings.split([x], '/'), '')[0, -1]
        prefix = tf.strings.substr(file_name, 0, 2)
        ids = mapping.lookup(prefix)
        return ids

    paths = tf.data.Dataset.list_files(PIANOROLL_PATH)

    files = (paths
            .map(tf.read_file)
            .apply(parse_numeric_csv)
            # Drop the first row and column (both are CSV indices) and append
            # an end-of-song marker column where every key equals 128.
            .map(lambda x: tf.concat([x[1:, 1:], 128. * tf.ones((128, 1))], axis=1))
            # Pair each song with its length so padding can be masked later.
            .map(lambda x: (x, tf.shape(x)[-1])))

    composers = paths.map(get_composer_ids)

    # (composer_id, (piano_roll, length)) tuples; relies on list_files
    # producing the same order for both pipelines.
    ds = tf.data.Dataset.zip((composers, files))
WARNING:tensorflow:From /home/finn/.local/lib/python3.6/site-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
In [4]:
# Cache the parsed songs in memory (the dataset is tiny), iterate EPOCHS
# times, and pad each mini-batch of piano rolls to the longest song in it.
ds = (ds.cache()
        .repeat(EPOCHS)
        .padded_batch(BATCH_SIZE, padded_shapes=((), ([FEATURE_SIZE, -1], ()))))

# Overlap host -> GPU copies with training compute.
ds = ds.apply(tf.data.experimental.prefetch_to_device('gpu:0'))

Model

  • Problem formulated as an autoregressive multilabel classification problem
  • Solved with a Long-Short Term Memory recurrent neural network
  • For each time step independently classify 128 keys using the cross entropy between model prediction and active keys at next time step
  • Initialise LSTM rollout with a general trainable initialisation or an independent initialisation per composer
In [5]:
class BinarizedNeuralComposer(tf.keras.Model):
    """Autoregressive LSTM over binarized piano rolls.

    At each time step the model emits `feature_size` independent key logits,
    conditioned on previous steps through a cuDNN LSTM. The rollout starts
    either from a shared trainable initial state (comp_id == -1) or from a
    per-composer trainable initial state.
    """

    def __init__(self, rnn_size, feature_size, composers, num_layers=None, dropout=0.2):
        """
        Args:
            rnn_size: hidden units per LSTM layer.
            feature_size: number of logits per time step (MIDI keys).
            composers: composer identifiers; one trainable initial state is
                learned per composer, plus one extra shared row.
            num_layers: stacked LSTM layers. Defaults to the notebook-level
                NUM_LAYERS constant (previously hard-coded) for backward
                compatibility.
            dropout: dropout rate inside the cuDNN LSTM (previously fixed 0.2).
        """
        super(BinarizedNeuralComposer, self).__init__()
        self.rnn_size = rnn_size
        self.feature_size = feature_size
        self.composers = composers
        if num_layers is None:
            num_layers = NUM_LAYERS  # resolve at call time, like the original
        self.num_layers = num_layers

        self.rnn = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers, self.rnn_size, dropout=dropout)
        # len(composers) + 1 rows: one per composer plus one for the
        # "generalist". NOTE(review): comp_id == -1 relies on how
        # tf.nn.embedding_lookup treats negative ids on this backend —
        # confirm it actually selects the extra row as intended.
        self.comp_emb_c = tf.get_variable(
            'comp_emb_c', (len(self.composers) + 1, num_layers, self.rnn_size))
        self.comp_emb_h = tf.get_variable(
            'comp_emb_h', (len(self.composers) + 1, num_layers, self.rnn_size))
        self.projection = tf.layers.Dense(self.feature_size)

    def call(self, data, comp_id=-1, state=None, training=True):
        """Run the LSTM over time-major `data` and project to key logits.

        Args:
            data: input sequence (presumably time x batch x keys — matches
                the transposes done by the training loop).
            comp_id: composer id(s) selecting the trainable initial state;
                ignored when `state` is given.
            state: optional (h, c) carry-over state for incremental rollout.
            training: whether dropout is active.

        Returns:
            (logits, state): per-step key logits and the final LSTM state.
        """
        if state is None:
            h = tf.nn.embedding_lookup(params=self.comp_emb_h, ids=comp_id)
            c = tf.nn.embedding_lookup(params=self.comp_emb_c, ids=comp_id)  # BATCH x LAYERS x RNN_SIZE
            h = tf.transpose(h, [1, 0, 2])  # -> LAYERS x BATCH x RNN_SIZE, as CudnnLSTM expects
            c = tf.transpose(c, [1, 0, 2])
            state = (h, c)

        out, state = self.rnn(data, state, training=training)
        logits = self.projection(out)

        return logits, state
In [6]:
model = BinarizedNeuralComposer(RNN_SIZE, FEATURE_SIZE, COMPOSERS)
optimizer = tf.train.AdamOptimizer(.001)

# Bundle model, optimizer, and global step into one checkpoint object so
# training can be resumed exactly where it left off.
root = tf.train.Checkpoint(optimizer=optimizer,
                           model=model,
                           optimizer_step=tf.train.get_or_create_global_step())

# Restore the newest checkpoint if one exists (checkpoint is None on a fresh run).
checkpoint = tf.train.latest_checkpoint(SAVE_DIR)
status = root.restore(checkpoint)
print(checkpoint)
composer/final_2_256-2
In [7]:
# L2 regularizer; currently unused because the penalty line below is commented out.
reg = tf.contrib.layers.l2_regularizer(L2_WEIGHT)

for comp_id, (data, length) in ds:
    begin = time()
    data = tf.transpose(data, [2, 0, 1]) # batch x keys x time -> time x batch x keys
    data = tf.to_float(tf.not_equal(data, tf.zeros_like(data))) # binarize

    # Next-step prediction: targets are the inputs shifted one step forward.
    x = data[:-1]
    y = data[1:]

    seq_length = length - 1

    # Zero out loss contributions from padded time steps; shape time x batch x 1.
    length_mask = tf.expand_dims(tf.transpose(tf.to_float(tf.sequence_mask(seq_length))), -1)

    # Cool scheduled sampling technique which didn't fix much: feed back a
    # random 50/50 mix of binarized model predictions and ground truth.
    '''
    y_hat, _ = model(x, comp_id)
    x_hat = tf.concat([x[:1], tf.nn.sigmoid(y_hat[:-1])], axis=0)
    x_hat = tf.where(x_hat > .1, tf.ones_like(x_hat), tf.zeros_like(x_hat))
    x_hat = tf.where(tf.random.uniform(tf.shape(x_hat)) > .5, x, x_hat)
    '''
    x_hat = x

    # Train generalist / specialist 50 / 50: with probability .5 replace each
    # composer id by -1 so the shared initial state also gets trained.
    comp_id = tf.where(.5 > tf.random.uniform(tf.shape(comp_id)), comp_id, -1 * tf.ones_like(comp_id))

    with tf.GradientTape() as tape:
        y_hat, _ = model(x_hat, comp_id)

        # Independent per-key cross entropy against the next step, masked to
        # the valid (unpadded) time steps.
        loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=y_hat, weights=length_mask)
        # loss += tf.contrib.layers.apply_regularization(reg, [model.projection.weights[0], model.comp_emb])
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables),
                              global_step=tf.train.get_or_create_global_step())

    if tf.train.get_or_create_global_step().numpy() % 100 == 0:
        # NOTE(review): tf.losses.* presumably returns a scalar already, so
        # reduce_sum looks like a no-op here — confirm before relying on it.
        print(tf.train.get_or_create_global_step().numpy(), time() - begin, tf.reduce_sum(loss).numpy())
13439 1.7697668075561523 0.017067058
13440 0.47074031829833984 0.20667078
13441 0.3589041233062744 0.15636821
13442 0.4713315963745117 0.16427551
13443 0.5055408477783203 0.14895837
13444 0.46884894371032715 0.14230567
13445 0.3617269992828369 0.13015598
13446 0.46538376808166504 0.12652126
13447 0.5380077362060547 0.113765076
13448 0.5326080322265625 0.10551794
13449 0.3926098346710205 0.099452555
13450 0.5235800743103027 0.09895464
13451 0.5344021320343018 0.09194707
13452 0.5445401668548584 0.08641332
13453 0.4169485569000244 0.08314595
13454 0.4767873287200928 0.08329182
13455 0.47605276107788086 0.08135775
13456 0.4435727596282959 0.07947672
13457 0.461500883102417 0.07751942
13458 0.5207424163818359 0.07681267
13459 0.4665994644165039 0.07744682
13460 0.4463956356048584 0.075534716
13461 0.5095610618591309 0.075674176
13462 0.47751855850219727 0.073943414
13463 0.47843384742736816 0.074454345
13464 0.39946413040161133 0.07315041
13465 0.4837486743927002 0.0732946
13466 0.4774799346923828 0.07120707
13467 0.5403831005096436 0.07057239
13468 0.42272019386291504 0.07266184
13469 0.4868447780609131 0.07137904
13470 0.5154192447662354 0.06980222
13471 0.4747600555419922 0.06947798
13472 0.40338850021362305 0.0698712
13473 0.4770796298980713 0.06990183
13474 0.47275376319885254 0.06900042
13475 0.5397777557373047 0.06772424
13476 0.3667266368865967 0.06715616
13477 0.5316894054412842 0.069975816
13478 0.4920058250427246 0.06749991
13479 0.5363726615905762 0.06657335
13480 0.401644229888916 0.065814905
13481 0.45397162437438965 0.069056585
13482 0.47008490562438965 0.0663356
13483 0.517918586730957 0.06525102
13484 0.36614465713500977 0.065520294
13485 0.4726674556732178 0.06766372
13486 0.48261523246765137 0.06534319
13487 0.4645211696624756 0.06447838
13488 0.3961336612701416 0.064725965
13489 0.5207593441009521 0.06606007
13490 0.4591636657714844 0.0648478
13491 0.46866273880004883 0.06357171
13492 0.3714756965637207 0.06405749
13493 0.49236297607421875 0.06521335
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-569125dd4ac2> in <module>
     30         loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=y_hat, weights=length_mask)
     31         # loss += tf.contrib.layers.apply_regularization(reg, [model.projection.weights[0], model.comp_emb])
---> 32     grads = tape.gradient(loss, model.trainable_variables)
     33     optimizer.apply_gradients(zip(grads, model.trainable_variables),
     34                               global_step=tf.train.get_or_create_global_step())

~/.local/lib/python3.6/site-packages/tensorflow/python/eager/backprop.py in gradient(self, target, sources, output_gradients)
    899         nest.flatten(target),
    900         flat_sources,
--> 901         output_gradients=output_gradients)
    902 
    903     if not self._persistent:

~/.local/lib/python3.6/site-packages/tensorflow/python/eager/imperative_grad.py in imperative_grad(tape, target, sources, output_gradients)
     62       target,
     63       sources,
---> 64       output_gradients)

~/.local/lib/python3.6/site-packages/tensorflow/python/eager/backprop.py in _gradient_function(op_name, attr_tuple, num_inputs, inputs, outputs, out_grads)
    115     return [None] * num_inputs
    116 
--> 117   return grad_fn(mock_op, *out_grads)
    118 
    119 

~/.local/lib/python3.6/site-packages/tensorflow/python/ops/cudnn_rnn_grad.py in _cudnn_rnn_backward(op, *grads)
     45       rnn_mode=op.get_attr("rnn_mode"),
     46       input_mode=op.get_attr("input_mode"),
---> 47       direction=op.get_attr("direction"))
     48 
     49 

~/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_cudnn_rnn_ops.py in cudnn_rnn_backprop(input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, rnn_mode, input_mode, direction, dropout, seed, seed2, name)
    314         output_h_backprop, output_c_backprop, reserve_space, "rnn_mode",
    315         rnn_mode, "input_mode", input_mode, "direction", direction, "dropout",
--> 316         dropout, "seed", seed, "seed2", seed2)
    317       _result = _CudnnRNNBackpropOutput._make(_result)
    318       return _result

KeyboardInterrupt: 
In [8]:
# Persist the current model/optimizer/step state; returns the checkpoint prefix.
root.save(SAVE_DIR + 'final_2_256')
Out[8]:
'composer/final_2_256-3'
In [7]:
# Helper functions copied from github.com/zehsilva/neural-composer-assignement

def piano_roll_to_pretty_midi(piano_roll, fs=100, program=1):
    '''Convert a piano-roll matrix into a single-instrument PrettyMIDI object.

    Parameters
    ----------
    piano_roll : np.ndarray, shape=(128, frames), dtype=int
        Piano roll of one instrument; cell values are note velocities.
    fs : int
        Sampling frequency of the columns — each column spans ``1./fs`` seconds.
    program : int
        MIDI program number for the instrument.

    Returns
    -------
    pretty_midi.PrettyMIDI
        A PrettyMIDI instance describing the piano roll.
    '''
    notes, frames = piano_roll.shape
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=program)

    # Pad one zero column on each side so initial and final events register
    # as velocity changes.
    padded = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant')

    # Any change in velocity marks a note-on or note-off boundary.
    change_points = np.nonzero(np.diff(padded).T)

    # Per-pitch bookkeeping: velocity of the currently sounding note (0 if
    # silent) and the time it started.
    current_velocity = np.zeros(notes, dtype=int)
    onset_time = np.zeros(notes)

    for frame, pitch in zip(*change_points):
        # frame + 1 compensates for the left padding column.
        velocity = padded[pitch, frame + 1]
        seconds = frame / fs
        if velocity > 0:
            # Note-on: only record it if the pitch was previously silent
            # (mid-note velocity changes are ignored).
            if current_velocity[pitch] == 0:
                onset_time[pitch] = seconds
                current_velocity[pitch] = velocity
        else:
            # Note-off: emit the completed note and mark the pitch silent.
            instrument.notes.append(pretty_midi.Note(
                velocity=current_velocity[pitch],
                pitch=pitch,
                start=onset_time[pitch],
                end=seconds))
            current_velocity[pitch] = 0
    midi.instruments.append(instrument)
    return midi

def visualize_piano_roll(pianoroll_matrix, fs=5):
    """Plot a piano-roll matrix with pypianoroll.

    input: piano-roll matrix, either (128 notes, time steps) — which is
    transposed to time-major float — or already (time steps, notes).
    effect: renders the piano-roll plot with matplotlib.
    """
    # pypianoroll expects time-major data; flip note-major input.
    if pianoroll_matrix.shape[0] == 128:
        pianoroll_matrix = pianoroll_matrix.T.astype(float)
    roll_track = pproll.Track(pianoroll=pianoroll_matrix, program=0, is_drum=False, name='piano roll')
    fig, ax = roll_track.plot(beat_resolution=fs)
    plt.show()

Failed attempts and other directions

The model trained very slowly, so I tried a few tricks to simplify the problem. I also thought a little about making the training objective more similar to the generative use case.

Things tried

  • Full regression on pitch - Couldn't find sweetspot between absolute junk and total memorization.
  • Weight training signal of keys on / off according to frequency per key - Very unstable training
  • Predict number of activated keys at next step and take that many - Didn't train any faster
  • Predict 3 steps ahead as an auxiliary task - Didn't warrant the extra complexity, and didn't use the predictions for anything at test time
  • Scheduled sampling for recovery (inject predictions during training) - Very surprised this didn't work better!

Things that would be fun to try

  • Formulate as a sequence GAN and train as a reinforcement learning policy. Too many moving parts!
In [10]:
# Utils for reading piano roll file and sample songs from model

def show_and_tell(path, composer_ids=(), length=30, fs=5):
    """Render a source song, then model continuations of its first seconds.

    Plots/plays the original piano roll, a continuation from the shared
    ("generalist") initial state, and one continuation per requested
    composer.

    Args:
        path: piano-roll CSV file to seed generation from.
        composer_ids: iterable of indices into COMPOSERS. Defaults to no
            specialists. (The previous default of -1 was a bare int and
            raised TypeError when iterated below.)
        length: seconds of audio / piano roll to render.
        fs: piano-roll frames per second.
    """
    if isinstance(composer_ids, int):
        # Be forgiving: wrap a bare int so the loop below still works.
        composer_ids = [composer_ids]

    print('Original song')
    song = read(path)
    render_pianoroll(song[:length * fs, 0, :])
    render_audio(song, length)

    print('Generalist')
    composed = compose(song, [-1], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
    render_pianoroll(composed[:length * fs, 0, :])
    render_audio(composed, length)

    for composer_id in composer_ids:
        print('Specialist:', COMPOSERS[composer_id])
        composed = compose(song, [composer_id], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
        render_pianoroll(composed[:length * fs, 0, :])
        render_audio(composed, length)

def read(path):
    """Load a piano-roll CSV and return it as a binarized numpy array.

    Drops the CSV index column, transposes, inserts a singleton middle
    axis, and clips values to at most 1. (Assumes the CSV layout produced
    by the dataset-preprocessing step — confirm against the data files.)
    """
    frame = pd.read_csv(path)
    roll = frame.values[:, 1:]                 # drop the index column
    roll = tf.expand_dims(tf.transpose(roll), 1)
    roll = tf.minimum(1., roll)                # binarize velocities
    return roll.numpy()

def render_pianoroll(song):
    """Show a (time, keys) piano-roll slice as a heatmap plus pypianoroll plot."""
    plt.figure(figsize=(15, 5))
    plt.imshow(song.transpose(), cmap='hot')
    plt.show()
    # Second, nicer rendering via pypianoroll.
    visualize_piano_roll(song)

def render_audio(song, length):
    """Synthesize `length` seconds of a piano roll and embed an audio player."""
    midi = piano_roll_to_pretty_midi(song[:, 0, :].transpose(), fs=5)
    waveform = midi.synthesize()
    clip = waveform[:44100 * length]  # 44.1 kHz samples
    IPython.display.display(IPython.display.Audio(clip, rate=44100))

def compose(song, composer_id, start_seconds=5, length_seconds=20, fs=5):
    """Continue a song autoregressively from its first `start_seconds`.

    Feeds the seed through the model once to warm up the LSTM state, then
    repeatedly feeds each binarized prediction back in for
    `fs * length_seconds` steps.

    Returns the seed concatenated with the generated steps (time axis 0).
    """
    seed = song[:fs * start_seconds]

    # Warm-up pass over the seed; keep only the last step's prediction.
    logits, state = model(seed, composer_id, training=False)
    step = round_to_zero(logits[-1:])

    generated = [seed, step]
    for _ in range(fs * length_seconds):
        logits, state = model(step, composer_id, state, training=False)
        step = round_to_zero(logits)
        generated.append(step)

    return tf.concat(generated, axis=0)

def round_to_zero(x, threshold=.1):
    """Binarize logits: sigmoid, then 1. where probability > threshold else 0."""
    probs = tf.nn.sigmoid(x)
    ones = tf.ones_like(probs)
    zeros = tf.zeros_like(probs)
    return tf.where(probs > threshold, ones, zeros)
In [11]:
# Demo: render the original Bach song, then the generalist continuation and
# the 'ba' and 'de' specialist continuations.
show_and_tell('songs/bach_847.csv', composer_ids=[COMPOSERS.index('ba'), COMPOSERS.index('de')])
Original song
Generalist
Specialist: ba
Specialist: de
In [ ]: